#!/usr/bin/env python
# coding: utf-8

# # Reading data sets
# - IC50
# - Genomic Features
# - Drug Decoder

# In[1]:


# Switch on inline figures. The %pylab magic is also what makes the bare
# names used further down (e.g. `matplotlib`, `title`) available here.
get_ipython().run_line_magic('pylab', 'inline')
# Default size for every figure drawn from here on.
matplotlib.rcParams.update({'figure.figsize': (10, 6)})


# In[2]:


# let us import some functions
from gdsctools import IC50, DrugDecode, GenomicFeatures
# and data sets
from gdsctools import ic50_test, genomic_features_test
from gdsctools.datasets import testing


# ## IC50

# The first type of data set to be used in the analysis is the matrix of IC50 values. There is a test file called **ic50_test** that gives the location of such a file.
# 

# In[3]:


# Build the IC50 object from the test file location exported by gdsctools.
ic50 = IC50(ic50_test)


# In[4]:


# Display the string representation of the loaded IC50 object.
print(ic50)


# In[5]:


# Plot the count of valid IC50 values per drug (as stated by the title set
# right after). `title` is not imported explicitly; presumably it comes
# from the %pylab magic run in the first cell.
data = ic50.plot_ic50_count(marker='o')
title("Count of valid IC50 values per drug")


# In[6]:


# NOTE(review): presumably draws histograms of the IC50 values -- confirm
# against the gdsctools documentation. The return value is kept in `data`.
data = ic50.hist()


# In[7]:


# Identifiers of the drugs to discard before drawing the histograms again.
drug_to_drop = [
    'Drug_999_IC50',
    'Drug_1047_IC50',
    'Drug_1049_IC50',
    'Drug_1050_IC50',
    'Drug_1052_IC50',
    'Drug_1053_IC50',
]
# The value returned by drop_drugs is kept only so the cell shows no output.
dummy = ic50.drop_drugs(drug_to_drop)
# Same histogram call as before, for comparison once the drugs are dropped.
data = ic50.hist()


# ## Genomic Features

# In[8]:


# Called without argument, GenomicFeatures falls back to the data set
# shipped with the package.
f = GenomicFeatures() # default from the package


# This is equivalent to passing the packaged test file explicitly:

# In[9]:


f = GenomicFeatures(genomic_features_test)


# In[10]:


# Display the string representation of the GenomicFeatures object.
print(f)


# Note that this GenomicFeatures matrix must have 3 special columns
# to provide the sample name, Tissue Factor Value and MSI factor value. 
# Then all features.

# In[11]:


# Peek at the first three rows of the underlying feature table.
f.df.head(3)


# In[12]:


# Draw the plot for the features and keep whatever the call returns.
df = f.plot()


# In[13]:


# Select the tissues that are represented by too few rows of the feature
# table. `groups` maps each TISSUE_FACTOR value to the row labels that
# carry that value, so the size of each entry is the row count per tissue.
MIN_SAMPLES_PER_TISSUE = 40  # tissues with strictly fewer rows are removed
groups = f.df.groupby('TISSUE_FACTOR').groups
to_remove = [tissue for tissue, members in groups.items()
             if len(members) < MIN_SAMPLES_PER_TISSUE]


# In[14]:


# Drop the tissues listed in `to_remove` (built in the previous cell), then
# call plot again to visualise the remaining data.
# NOTE(review): assumes drop_tissue_in modifies `f` in place -- confirm.
info = f.drop_tissue_in(to_remove)
f.plot()


# ## Drug Decoder

# GDSCTools provides an IC50 test file (ic50_test). The drug identifiers are
# usually encoded with a unique identifier that has no meaning. A decoder
# file may be provided. For example, we provide the drug_test data set.

# In[15]:


# Show the value of testing.drug_test_csv, which is then handed to DrugDecode.
print(testing.drug_test_csv)


# In[16]:


# Build the decoder from the test file and display its string representation.
dd = DrugDecode(testing.drug_test_csv)
print(dd)


# It can be used to retrieve the name and target of the drug

# In[17]:


# Look up the name associated with one drug identifier
# (the result is displayed as the notebook cell output).
dd.get_name('Drug_1047_IC50')


# In[18]:


# Look up the target associated with the same drug identifier.
dd.get_target('Drug_1047_IC50')


# In[ ]: